library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.6 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.2.0 ──
## ✓ broom 0.8.0 ✓ rsample 0.1.1
## ✓ dials 0.1.1 ✓ tune 0.2.0
## ✓ infer 1.0.0 ✓ workflows 0.2.6
## ✓ modeldata 0.1.1 ✓ workflowsets 0.2.1
## ✓ parsnip 0.2.1 ✓ yardstick 0.0.9
## ✓ recipes 0.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(kknn)
library(tidytext)
library(textrecipes)
library(here)
## here() starts at /Users/amitisraeli/peer_read
library(moderndive)
library(ranger)
library(recipes)
library(rsample)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:kknn':
##
## contr.dummy
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
library(tidymodels)
library(kknn)
# Use a minimal ggplot2 theme for every plot in this document.
theme_set(theme_minimal())
# Colour constants used in plots below.
scotblue <- "#0065BF"
ukred <- "#D00C27"
Load the processed data, which contains only the features and the recommendation score.
# Read the cleaned dataset: one row per review, with numeric aspect scores,
# the overall RECOMMENDATION score, and the review comment text.
data <- read_csv('Data/clean_data.csv')
## New names:
## Rows: 308 Columns: 15
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): conference, title, comments dbl (12): ...1, Date, id, IMPACT, SUBSTANCE,
## APPROPRIATENESS, MEANINGFUL_COM...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Print the column names to confirm the expected schema.
colnames(data)
## [1] "...1" "conference" "Date"
## [4] "title" "id" "IMPACT"
## [7] "SUBSTANCE" "APPROPRIATENESS" "MEANINGFUL_COMPARISON"
## [10] "SOUNDNESS_CORRECTNESS" "ORIGINALITY" "CLARITY"
## [13] "REVIEWER_CONFIDENCE" "RECOMMENDATION" "comments"
# Keep only the numeric columns, then drop identifier/bookkeeping columns
# that are numeric but are not model features.
# (`select_if()` is superseded; use `select(where(...))` per dplyr >= 1.0.)
numeric_cols <- select(data, where(is.numeric))
drops <- c("id", "index", "Date", "...1")
numeric_cols <- numeric_cols[, !(names(numeric_cols) %in% drops)]
# Reshape to long format (one row per feature/value pair) for faceted
# plotting; `all_of()` makes the external character vector explicit.
data_long <- data %>%
  pivot_longer(all_of(colnames(numeric_cols))) %>%
  as.data.frame()
data_long
Distributions
# Per-feature mean, so each facet gets its own mean line.
# (The original mean(value) inside aes() was evaluated on the whole long
# data frame, drawing one identical global-mean line in every facet.)
feature_means <- data_long %>%
  group_by(name) %>%
  summarise(mean_value = mean(value), .groups = "drop")
# Histogram + density per numeric feature. `..density..` notation is
# superseded; `after_stat(density)` is available since ggplot2 3.3.0.
ggplot(data_long, aes(x = value)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 1) +
  geom_density(col = "#FF0000") +
  geom_vline(data = feature_means, aes(xintercept = mean_value),
             col = "#0096B7", linetype = "dashed", size = 0.75) +
  facet_wrap(~ name, scales = "free") +
  labs(x = "", y = "Density", title = "Quick Overview of the aspects",
       subtitle = "Histogram for each numeric feature, with density and mean line")
Split the data into training and test sets.
# Fix the RNG seed so the train/test split is reproducible.
set.seed(1234)
# Default split proportion (3/4 train per rsample::initial_split),
# stratified on the outcome so RECOMMENDATION is similarly distributed
# in both sets.
data_split <- initial_split(data, strata = RECOMMENDATION)
train <- training(data_split)
test <- testing(data_split)
Fit the linear model.
# Ordinary least squares: predict the overall recommendation score from
# the eight numeric review-aspect scores in the training set.
RECOMMENDATION_linear <- lm(
  RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
    MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS +
    ORIGINALITY + CLARITY + REVIEWER_CONFIDENCE,
  data = train
)
Histogram of the prediction error.
# Residuals (observed - fitted) of the linear model on the training data.
model_points <- get_regression_points(RECOMMENDATION_linear)
# Histogram of the residuals; a roughly symmetric shape around 0 suggests
# an unbiased fit. (Fixed typo in the title: "Predicton" -> "Prediction".)
ggplot(model_points, aes(x = residual)) +
  geom_histogram(bins = 50, color = "#000000", fill = "#0099F8") +
  labs(
    title = "Histogram of Recommendation Prediction of the linear model",
    x = "Recommendation Prediction Error",
    y = "Count"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "bold"),
    plot.caption = element_text(face = "italic")
  )
Mean squared error of the linear model.
# Training-set mean squared error of the linear model.
mean(model_points$residual^2)
## [1] 0.4389945
KNN
# K-nearest-neighbours classifier (k = 5) on four aspect scores;
# the numeric recommendation is treated as a class label.
knn_mod <- nearest_neighbor(neighbors = 5) %>%
  set_mode("classification") %>%
  fit(as.factor(RECOMMENDATION) ~ SUBSTANCE + CLARITY + REVIEWER_CONFIDENCE + IMPACT,
      data = train)
knn_mod
## parsnip model object
##
##
## Call:
## kknn::train.kknn(formula = as.factor(RECOMMENDATION) ~ SUBSTANCE + CLARITY + REVIEWER_CONFIDENCE + IMPACT, data = data, ks = min_rows(5, data, 5))
##
## Type of response variable: nominal
## Minimal misclassification: 0.4913043
## Best kernel: optimal
## Best k: 5
Inspect the KNN predictions alongside the true labels.
# Predicted class for each test row, side by side with the true score.
knn_pred <- predict(knn_mod, test) %>%
  bind_cols(select(test, RECOMMENDATION))
knn_pred
Confusion matrix for the KNN model.
# Confusion matrix of the KNN model as a heat-map.
# Convert the numeric truth column to a factor with the same levels as the
# predictions up front, so conf_mat() does not emit its coercion warning.
# (Also fixed typos in the user-facing plot labels.)
knn_pred %>%
  mutate(RECOMMENDATION = factor(RECOMMENDATION, levels = levels(.pred_class))) %>%
  conf_mat(RECOMMENDATION, .pred_class) %>%
  pluck(1) %>%
  as_tibble() %>%
  ggplot(aes(Prediction, Truth, alpha = n)) +
  geom_tile(show.legend = FALSE) +
  geom_text(aes(label = n), colour = "white", alpha = 1, size = 8) +
  labs(
    title = "Confusion matrix of the KNN model",
    x = "Prediction",
    y = "Recommendation Score"
  )
## Warning in vec2table(truth = truth, estimate = estimate, dnn = dnn, ...):
## `truth` was converted to a factor
Accuracy of the KNN model.
# Test-set accuracy: tf is 1 when the prediction matches the true label.
# Accuracy is simply the mean of that 0/1 indicator — no need for
# sum()/length() by hand.
knn_pred$tf <- as.integer(knn_pred$RECOMMENDATION == knn_pred$.pred_class)
mean(knn_pred$tf)
## [1] 0.474359
Random Forest
# Random forest classifier (ranger backend, default hyper-parameters)
# on the same four aspect scores.
rf_mod <- rand_forest() %>%
  set_mode("classification") %>%
  fit(as.factor(RECOMMENDATION) ~ SUBSTANCE + CLARITY + REVIEWER_CONFIDENCE + IMPACT,
      data = train)
rf_mod
## parsnip model object
##
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 500
## Sample size: 230
## Number of independent variables: 4
## Mtry: 2
## Target node size: 10
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.3510778
Inspect the Random Forest predictions alongside the true labels.
# Predicted class for each test row, side by side with the true score.
rf_pred <- predict(rf_mod, test) %>%
  bind_cols(select(test, RECOMMENDATION))
rf_pred
Confusion matrix for the Random Forest model.
# Confusion matrix of the Random Forest model as a heat-map.
# Convert the numeric truth column to a factor with the same levels as the
# predictions up front, so conf_mat() does not emit its coercion warning.
# (Also fixed typos in the user-facing plot labels.)
rf_pred %>%
  mutate(RECOMMENDATION = factor(RECOMMENDATION, levels = levels(.pred_class))) %>%
  conf_mat(RECOMMENDATION, .pred_class) %>%
  pluck(1) %>%
  as_tibble() %>%
  ggplot(aes(Prediction, Truth, alpha = n)) +
  geom_tile(show.legend = FALSE) +
  geom_text(aes(label = n), colour = "white", alpha = 1, size = 8) +
  labs(
    title = "Confusion matrix of the Random Forest model",
    x = "Prediction",
    y = "Recommendation Score"
  )
## Warning in vec2table(truth = truth, estimate = estimate, dnn = dnn, ...):
## `truth` was converted to a factor
Accuracy of the Random Forest model.
# Test-set accuracy: tf is 1 when the prediction matches the true label.
# Accuracy is simply the mean of that 0/1 indicator — no need for
# sum()/length() by hand.
rf_pred$tf <- as.integer(rf_pred$RECOMMENDATION == rf_pred$.pred_class)
mean(rf_pred$tf)
## [1] 0.5769231
Neural Network with one hidden layer and 13 neurons in it
# Single-hidden-layer perceptron (13 hidden units) on three aspect scores;
# the numeric recommendation is treated as a class label.
nnet_mod <- mlp(hidden_units = 13) %>%
  set_mode("classification") %>%
  fit(as.factor(RECOMMENDATION) ~ SUBSTANCE + CLARITY + REVIEWER_CONFIDENCE,
      data = train)
nnet_mod
## parsnip model object
##
## a 3-13-5 network with 122 weights
## inputs: SUBSTANCE CLARITY REVIEWER_CONFIDENCE
## output(s): as.factor(RECOMMENDATION)
## options were - softmax modelling
Inspect the Neural Network predictions alongside the true labels.
# Predicted class for each test row, side by side with the true score.
nnet_pred <- predict(nnet_mod, test) %>%
  bind_cols(select(test, RECOMMENDATION))
nnet_pred
Confusion matrix for the Neural Network model.
# Confusion matrix of the Neural Network model as a heat-map.
# Convert the numeric truth column to a factor with the same levels as the
# predictions up front, so conf_mat() does not emit its coercion warning.
# (Also fixed typos in the user-facing plot labels.)
nnet_pred %>%
  mutate(RECOMMENDATION = factor(RECOMMENDATION, levels = levels(.pred_class))) %>%
  conf_mat(RECOMMENDATION, .pred_class) %>%
  pluck(1) %>%
  as_tibble() %>%
  ggplot(aes(Prediction, Truth, alpha = n)) +
  geom_tile(show.legend = FALSE) +
  geom_text(aes(label = n), colour = "white", alpha = 1, size = 8) +
  labs(
    title = "Confusion matrix of the Neural Network model",
    x = "Prediction",
    y = "Recommendation Score"
  )
## Warning in vec2table(truth = truth, estimate = estimate, dnn = dnn, ...):
## `truth` was converted to a factor
accuracy of the Neural Network
# Test-set accuracy: tf is 1 when the prediction matches the true label.
# Accuracy is simply the mean of that 0/1 indicator — no need for
# sum()/length() by hand.
nnet_pred$tf <- as.integer(nnet_pred$RECOMMENDATION == nnet_pred$.pred_class)
mean(nnet_pred$tf)
## [1] 0.5641026
# Mean squared error of the Neural Network's class predictions.
# BUG FIX: as.numeric() on a factor returns the internal level codes
# (1, 2, ...), not the underlying score values; convert through
# as.character() first. RECOMMENDATION is already numeric (dbl) and
# needs no conversion.
mean((as.numeric(as.character(nnet_pred$.pred_class)) - nnet_pred$RECOMMENDATION)^2)
## [1] 0.8076923